//==============================================================================
// Project:		Wealth Transfers and their economic effects
// File name:	IWP 3
// Objective: 	Part 3 of IWP work - alternative imputation/estimation of parents' wealth at death and associated tests
//
//==============================================================================

clear all

************************
*** Data import ********
************************

cd "XXXX" //set content directory here

* Load the base file. -clear- is the documented option of -use- for discarding
* any data in memory (-replace- is documented for -save-, not for -use-).
use combined_filtered_restricted.dta, clear

* Attach the variables created in IWP 2.do, keyed on hhpxid and wave so that
* they are matched to the person's partner. keep(master match) discards rows
* that exist only in the using file; nogenerate suppresses _merge.
merge m:1 hhpxid wave using IWP_data_2.dta, keep(master match) nogenerate

* Attach the individual wealth variables, again discarding using-only rows.
merge m:1 xwaveid wave using individual_weight_vars_restricted.dta, keep(master match) nogenerate

drop if hgint == 0 //drop non-responders

****************************************************
*** Create gender-free person-specific variables ***
****************************************************

*Note: variables created here (demographic characteristics of responding persons) are matched to those created about the parents of the child generation in IWP 1.do

* Individual net worth: assets less debts.
gen net_worth = pwassei - pwdebti

* Polynomial terms in year of birth.
gen yob_squ = yob^2
gen yob_cub = yob^3

* University indicator: 1 for edhigh1 codes 1-3, 0 for codes 4, 5, 8 and 9,
* missing for every other code.
gen university = 1 if inlist(edhigh1, 1, 2, 3)
replace university = 0 if inlist(edhigh1, 4, 5, 8, 9)

* Highest level of schooling, collapsed to the categories used in IWP 1:
* 1 is grade 12, 2 is grade 11, 3 is highschool but not higher than grade 10,
* 4 is primary school and special needs. Negative codes become missing.
gen school = edhists
replace school = 3 if inrange(edhists, 3, 6)
replace school = 4 if inrange(edhists, 7, 9)
replace school = . if edhists < 0

* Country of birth (categories match IWP 1): 1 is Australia, 2 is main English
* speaking, 3 is other. Code -4 is left missing.
gen birth_country = anbcob if anbcob != -4

*****************************************************
*** Create equivalent variables about the partner ***
*****************************************************
*Note: these variables draw from IWP 2.do

* Each *_partner variable is spread across all rows of an xwaveid by taking
* its maximum within xwaveid.

* Partner year of birth, with its square and cube.
egen yob_partner_max = max(yob_partner), by(xwaveid)
gen yob_partner_max_squ = yob_partner_max^2
gen yob_partner_max_cub = yob_partner_max^3

* Partner's university indicator, highest level of schooling, country of
* birth and gender.
foreach trait in university school birth_country sex {
	egen `trait'_partner_max = max(`trait'_partner), by(xwaveid)
}

*****************************************************************
*** Create separate male and female person-specific variables ***
*****************************************************************
* For each characteristic the respondent's own value is used when the
* respondent has the relevant sex, and the partner's value otherwise.

*Male variables
*Note: If male same sex partners, take the male variable from the longer lived person. If female same sex partners, leave blank.

foreach v in school university birth_country yob {
	gen `v'_male = `v' if sex == 1
	replace `v'_male = `v'_partner_max if sex == 2
	replace `v'_male = . if sex == 2 & sex_partner_max == 2 //female same sex couple: no male value
}

gen yob_male_squ = yob_male^2
gen yob_male_cub = yob_male^3 //cube (previously computed with ^2, duplicating yob_male_squ)

*Female variables
*Note: If female same sex partners, take the female variable from the longer lived person. If male same sex partners, leave blank.

foreach v in school university birth_country yob {
	gen `v'_female = `v' if sex == 2
	replace `v'_female = `v'_partner_max if sex == 1
	replace `v'_female = . if sex == 1 & sex_partner_max == 1 //male same sex couple: no female value
}

gen yob_female_squ = yob_female^2
gen yob_female_cub = yob_female^3 //cube (previously computed with ^2, duplicating yob_female_squ)

*************************************
*** Create couple-level variables ***
*************************************

* Polynomial terms in survey wave.
gen wave_squ = wave^2
gen wave_cub = wave^3

* Flag for having a child in the child cohort. A child's year of birth is
* derived as (wave + 2000) minus the recorded age, and the flag is switched on
* when any derived year of birth lies between 1944 and 1954 inclusive.
gen child_63_73 = 0

* ncage1-ncage14
forvalues i = 1/14 {
	gen ncage`i'_yob = wave + 2000 - ncage`i'
	replace child_63_73 = 1 if inrange(ncage`i'_yob, 1944, 1954)
}

* rcage1-rcage13
forvalues i = 1/13 {
	gen rcage`i'_yob = wave + 2000 - rcage`i'
	replace child_63_73 = 1 if inrange(rcage`i'_yob, 1944, 1954)
}

* Number of children.
* NOTE(review): the flag above draws on both the ncage* and rcage* series but
* this count uses tcnr only - confirm tcnr covers all children as intended.
gen children = tcnr

**********************************
*** Drop unwanted observations ***
**********************************
*Aim is to isolate a sample of responding people with similar characteristics to the parent cohort (call these the matched parent cohort).

* Spread the partner's year of death across all rows of an xwaveid.
egen yodeath_partner_max = max(yodeath_partner), by(xwaveid)

* Drop people who don't have a partner who has died by 2019 (both people who
* never had a partner and people whose partner is still alive).
drop if yodeath_partner_max == . | yodeath_partner_max == -1

* Drop people whose partner has now died but not at time of record.
drop if yodeath_partner_max >= wave + 2000

*keep only last recorded entry in a wealth year

* Wealth was recorded in waves 2, 6, 10, 14 and 18; flag those waves, find
* each xwaveid's latest flagged wave, and keep only that row.
gen wealth_wave = inlist(wave, 2, 6, 10, 14, 18)
egen max_wave = max(wave) if wealth_wave == 1, by(xwaveid)
keep if wave == max_wave


****************************
*** Fit model **************
****************************

* All three models are estimated on people with at least one child. Stata
* treats missing values as larger than any number, so "children > 0" on its
* own would also admit observations whose number of children is missing;
* !missing(children) excludes them.

*****model 1 coefficients mostly significant and directions as expected, except some female variables (year and country of birth) do not seem to have any explanatory power - try removing them
reg net_worth yob_male yob_female yob_male_squ yob_female_squ wave wave_squ i.school_male i.school_female university_male university_female i.birth_country_male i.birth_country_female child_63_73 [pweight = hhwtrps] if children > 0 & !missing(children)

*****model 2 (model 1 with female year and country of birth variables removed) - all good, except wave_squ has no explanatory power
reg net_worth yob_male yob_male_squ wave wave_squ i.school_male i.school_female university_male university_female i.birth_country_male child_63_73 [pweight = hhwtrps] if children > 0 & !missing(children)

*****model 3 (model 2 with wave_squ removed) - all coefficients look OK
reg net_worth yob_male yob_male_squ wave i.school_male i.school_female university_male university_female i.birth_country_male child_63_73 [pweight = hhwtrps] if children > 0 & !missing(children)
estimates store model_3

esttab model_3, se(a3) star(* 0.1 ** 0.05 *** 0.01) scsv //table B.1

* Fitted values from model 3 for the matched parent cohort, then save the
* dataset for the sample comparison later in this .do file.
predict parent_WAD_expected_orig //this is used later in this .do file
save IWP_data_3, replace

************************************************
*** Apply model to actual data on parents  *****
************************************************
* Swap in the parent data from IWP 1.do. -clear- is the documented option of
* -use- for discarding the data in memory; the estimation results from the
* last regression stay active, so -predict- below applies that model.
use IWP_data_1.dta, clear

gen wave_squ = wave^2 //not a regressor in model 3 (wave enters linearly only)

predict parent_WAD_expected //predicted values based on model_3 above
codebook parent_WAD_expected //note 75 missing values because insufficient information about parents known to predict

gen parent_WAD_expected_scaled = parent_WAD_expected/1000 //scale by 1000 as this makes the parameters in the upcoming probit models easier to interpret 

********************************************************************************
*** Tests for whether parent_WAD_expected explains the zeros of parent_WAD *****
********************************************************************************

* Indicator for whether parent_WAD (from IWP 1.do) is zero (= 1) or positive
* (= 0); left missing when parent_WAD is missing.
gen parent_WAD_0 = .
replace parent_WAD_0 = 1 if parent_WAD == 0
replace parent_WAD_0 = 0 if parent_WAD > 0 & parent_WAD != .

probit parent_WAD_0 parent_WAD_expected_scaled //table B.2, model 1 - negative, statistically significant
estimates store model_1

probit parent_WAD_0 parent_WAD_expected_scaled if parent_WAD_expected <= 0 //table B.2, model 2 - negative, marginally statistically significant
estimates store model_2

* Observations with a missing prediction satisfy "> 0" but are dropped by
* probit anyway because the regressor is missing.
probit parent_WAD_0 parent_WAD_expected_scaled if parent_WAD_expected > 0 //table B.2, model 3 - negative, not statistically significant
estimates store model_3

codebook parent_WAD_0 if parent_WAD_expected <= 0 // 87 of 142 cases - 61% of cases
* Missing values compare as larger than any number, so the missing predictions
* must be excluded explicitly to count the same observations as model 3.
* Previously recorded result (without this exclusion): 244 of 423 cases - 58%.
* NOTE(review): re-check these counts after the exclusion.
codebook parent_WAD_0 if parent_WAD_expected > 0 & !missing(parent_WAD_expected)

*This is consistent with the zeros in parent_WAD being random where parent_WAD_expected is large but non-random where parent_WAD_expected is not large. See discussion in section B.1.

************************
*** Create samples *****
************************

*Note: Approach is to construct a representative sample of parent_WAD (and alternatives, for robustness checks) by including the non-zero parent_WADs, and a small number of the parent_WADs for which parent_WAD_expected is lowest (as these are the most likely to be the cases where the child genuinely did not receive an inheritance). See section B.1 for more information. 

codebook parent_WAD_0
codebook parent_WAD_expected

*There are 243 non-zero observations of parent_WAD, so the number of zeros to add in is 243 * x / (1-x), where x is the desired share of zeros in the sample (in decimal terms). As explained in section B.1, we use x = 0.1 as the central estimate, with x = 0.02 and x = 0.2 as robustness checks.
*With x = 0.02, add 5 observations in
*With x = 0.1, add 27 observations in
*With x = 0.2, add 61 observations in

* Rank the observations so that the zeros of parent_WAD come first, ordered
* from lowest to highest parent_WAD_expected: _n = 1 is the lowest value of
* parent_WAD_expected that is a zero of parent_WAD, _n = 2 the second lowest,
* etc. -stable- keeps tied observations in their existing order so the ranks
* (and therefore the samples) are reproducible from run to run.
gen parent_WAD_0_neg = -parent_WAD_0
sort parent_WAD_0_neg parent_WAD_expected, stable
gen parent_WAD_expected_rank = _n

* Each sample holds every positive parent_WAD plus the lowest-ranked zeros.
* Missing values compare as larger than any number, so "parent_WAD > 0" alone
* would also pull in observations with missing parent_WAD; they are excluded
* explicitly, as is done when parent_WAD_0 is built.
gen sample_2pc_zero = 0 //put 5 obs in
replace sample_2pc_zero = 1 if (parent_WAD > 0 & !missing(parent_WAD)) | parent_WAD_expected_rank <= 5
gen sample_10pc_zero = 0 //put 27 obs in
replace sample_10pc_zero = 1 if (parent_WAD > 0 & !missing(parent_WAD)) | parent_WAD_expected_rank <= 27
gen sample_20pc_zero = 0 //put 61 obs in
replace sample_20pc_zero = 1 if (parent_WAD > 0 & !missing(parent_WAD)) | parent_WAD_expected_rank <= 61

******************************************************************************************
*** Tests for whether parent_WAD_expected explains the remaining zeros of parent_WAD *****
******************************************************************************************

*If the samples determined above are representative of the population, the remaining zeros of parent_WAD should be random (and, hence, not correlated with parent_WAD_expected).

* Re-run the probit after excluding the 5, 27 and 61 lowest-ranked
* observations in turn, storing the results as model_4, model_5 and model_6.
* Table B.2 findings:
*   model 4 (rank > 5)  - negative, still statistically significant
*   model 5 (rank > 27) - negative, not statistically significant
*   model 6 (rank > 61) - positive, not statistically significant
local model_no = 4
foreach cutoff in 5 27 61 {
	probit parent_WAD_0 parent_WAD_expected_scaled if parent_WAD_expected_rank > `cutoff'
	estimates store model_`model_no'
	local ++model_no
}

save IWP_data_4.dta, replace

************************************************************
*** Comparison of samples with the matched parent cohort ***
************************************************************

* Weighted distribution of parent_WAD within each sample
* (table B.3, columns 2, 4 and 6 for the 2%, 10% and 20% samples).
foreach s in 2 10 20 {
	summarize parent_WAD [aweight = wlra_s] if sample_`s'pc_zero == 1, d
}

*Create weights to facilitate comparison across waves - need the share of the wlra_s observed at each wave for each sample
foreach s in 2 10 20 {
	* total of wlra_s within the sample, by wave and overall
	egen total_wlra_s_wave_s`s' = total(wlra_s) if sample_`s'pc_zero == 1, by(wave)
	egen total_wlra_s_s`s' = total(wlra_s) if sample_`s'pc_zero == 1
	* share of the sample's wlra_s that falls in each wave
	gen share_wlra_s_wave_s`s' = total_wlra_s_wave_s`s' / total_wlra_s_s`s' if sample_`s'pc_zero == 1
	* copy the share onto every row of the wave, including rows outside the sample
	egen max_share_wlra_s_wave_s`s' = max(share_wlra_s_wave_s`s'), by(wave)
}

* Reduce to one row per wave carrying the three wave shares.
keep max_share_wlra_s_wave* wave
duplicates drop 

* Bring in the matched parent cohort saved earlier in this .do file.
merge 1:m wave using IWP_data_3.dta

* Weights: the product of the survey weights and the wave shares derived above.
foreach s in 2 10 20 {
	gen weight_s`s' = hhwtrps*max_share_wlra_s_wave_s`s'
}

* Weighted distribution of net worth for the matched parent cohort
* (table B.3, columns 3, 5 and 7).
foreach s in 2 10 20 {
	summarize net_worth [aweight = weight_s`s'] if child_63_73 == 1,d
}
